Slip 2


Q.1. Write a Python program to implement simple linear regression for predicting house
prices. First find all null values in the given dataset and remove them.

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Simple linear regression: predict house price from a single feature.
# Flow: load CSV -> report nulls -> drop incomplete rows -> split ->
# fit -> evaluate -> plot.
# ---------------------------------------------------------------------------

# Step 1: Read the data (swap in your own CSV path if the file name differs).
df = pd.read_csv("house_prices.csv")

print("✅ Dataset loaded successfully!\n")
print("First 5 Rows:\n", df.head(), "\n")

# Step 2: Count the missing values in every column.
null_counts = df.isnull().sum()
print("🔍 Checking for Null Values:\n", null_counts, "\n")

# Step 3: Keep only the rows that have no missing value in any column.
df_cleaned = df.dropna()
print("✅ After Removing Null Values, Dataset Shape:", df_cleaned.shape, "\n")

# Step 4: Choose the predictor and the target.
# NOTE: the column names 'Area' (sq ft) and 'Price' are assumptions about the
# CSV layout -- adjust them to match your dataset.
feature_columns = ['Area']
target_column = 'Price'
X = df_cleaned[feature_columns]  # independent variable (kept 2-D for sklearn)
y = df_cleaned[target_column]    # dependent variable

# Step 5: Hold out 20% of the rows for testing; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Fit the regression on the training portion (fit() returns the
# estimator itself, so construction and training can be chained).
model = LinearRegression().fit(X_train, y_train)

# Step 7: Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Step 8: Score the predictions and report the fitted line's parameters.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("📊 Model Evaluation:")
report = (
    ("Mean Squared Error:", mse, 2),
    ("R² Score:", r2, 4),
    ("Intercept (b0):", model.intercept_, 2),
    ("Slope (b1):", model.coef_[0], 2),
)
for caption, value, digits in report:
    print(caption, round(value, digits))

# Step 9: Draw the held-out points together with the fitted line.
plt.scatter(X_test, y_test, color='blue', label='Actual Prices')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted Line')
plt.xlabel("Area (sq ft)")
plt.ylabel("Price")
plt.title("Simple Linear Regression - House Price Prediction")
plt.legend()
plt.show()

Q.2. The dataset refers to clients of a wholesale distributor. It includes the annual
spending in monetary units on diverse product categories. Using the Wholesale
customers dataset, apply agglomerative clustering to find clients with similar
annual spending within the same region.

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering of wholesale clients by annual spending, followed
# by a per-region breakdown of the resulting clusters.

# Load dataset
# Wholesale customers dataset: https://archive.ics.uci.edu/ml/datasets/wholesale+customers
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"
data = pd.read_csv(url)

print("Dataset shape:", data.shape)
print(data.head())

# Select only the annual spending features.
# 'Channel' and 'Region' are left out of the distance computation so that
# clusters are formed purely from spending; 'Region' is brought back in
# below when the clusters are summarised.
X = data.drop(["Channel", "Region"], axis=1)

# Standardize the data (important for clustering: Ward linkage is
# distance-based, so unscaled high-magnitude columns would dominate).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create Agglomerative Clustering model
agg = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = agg.fit_predict(X_scaled)

# Add cluster labels to dataset
data['Cluster'] = labels

print("\nClustered Data Sample:")
print(data.head())

# Relate the spending clusters back to Region. Without this step the Region
# column is dropped and never used, so the output says nothing about which
# clients in the same region share a spending pattern.
# NOTE(review): the UCI description lists Region as a categorical code
# (1 = Lisbon, 2 = Oporto, 3 = Other) -- confirm against the dataset docs.
print("\nClients per Region and Cluster:")
print(pd.crosstab(data['Region'], data['Cluster']))

print("\nMean Annual Spending per Region and Cluster:")
spending_columns = X.columns.tolist()
print(data.groupby(['Region', 'Cluster'])[spending_columns].mean().round(2))

# Plot dendrogram (built with SciPy on the same scaled data and linkage
# criterion, because the sklearn estimator does not draw one itself).
linked = linkage(X_scaled, 'ward')

plt.figure(figsize=(10, 5))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=False)
plt.title("Dendrogram - Agglomerative Clustering")
plt.show()

# Visualize two dimensions (e.g., Fresh vs Milk), coloured by cluster label.
plt.figure(figsize=(8,6))
plt.scatter(data['Fresh'], data['Milk'], c=data['Cluster'], cmap='rainbow')
plt.xlabel("Fresh")
plt.ylabel("Milk")
plt.title("Agglomerative Clustering (Fresh vs Milk)")
plt.show()
